<!doctype html>



  


<html class="theme-next mist use-motion" lang="zh-Hans">
<head>
  <meta charset="UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<!-- Viewport: no maximum-scale cap, so users can still pinch-zoom the page. -->
<meta name="viewport" content="width=device-width, initial-scale=1"/>



<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />















  
  
  <link href="/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css" />




  
  
  
  

  
    
    
  

  
    
      
    

    
  

  

  
    
      
    

    
  

  
    
      
    

    
  

  
    
    
    <!-- Web fonts: explicit https, "+" for spaces in family names, and an escaped ampersand in the query string. -->
    <link href="https://fonts.googleapis.com/css?family=Monda:300,300italic,400,400italic,700,700italic|Roboto+Slab:300,300italic,400,400italic,700,700italic|Lobster+Two:300,300italic,400,400italic,700,700italic|PT+Mono:300,300italic,400,400italic,700,700italic&amp;subset=latin,latin-ext" rel="stylesheet" type="text/css">
  






<link href="/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css" />

<link href="/css/main.css?v=5.1.0" rel="stylesheet" type="text/css" />


  <meta name="keywords" content="Python3,爬虫,Urllib2,正则表达式," />





  <link rel="alternate" href="http://blog.csdn.net/qq_21265915/rss/list" title="WinterSmileSB101 的个人房间" type="application/atom+xml" />




  <link rel="shortcut icon" type="image/x-icon" href="/images/myHeadImg.jpeg?v=5.1.0" />






<meta name="description" content="版权声明：本文为 wintersmilesb101 -（个人独立博客– http://wintersmilesb101.online 欢迎访问）博主原创文章，未经博主允许不得转载。

我们今天就一起来通过 Python3 自带库 Urllib 与正则表达式来抓取糗事百科。废话不多说，下面正题：分析网址通过浏览器进入糗事百科首页，http://www.qiushibaike.com/你会看到如下界">
<meta property="og:type" content="article">
<meta property="og:title" content="Python3.7 爬虫（一）使用 Urllib2 与正则表达式抓取">
<meta property="og:url" content="http://WinterSmileSB101.online/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/index.html">
<meta property="og:site_name" content="WinterSmileSB101 的个人房间">
<meta property="og:description" content="版权声明：本文为 wintersmilesb101 -（个人独立博客– http://wintersmilesb101.online 欢迎访问）博主原创文章，未经博主允许不得转载。

我们今天就一起来通过 Python3 自带库 Urllib 与正则表达式来抓取糗事百科。废话不多说，下面正题：分析网址通过浏览器进入糗事百科首页，http://www.qiushibaike.com/你会看到如下界">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/34449674-file_1491644670774_410a.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/53875935-file_1491655849141_11631.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/85847293-file_1491656241405_70a9.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/50805057-file_1491657254172_8d4f.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/42316034-file_1491659592251_5971.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/84188870-file_1491662005565_12201.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/14164501-file_1491662403193_3fa9.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/99243932-file_1491663434712_16972.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-9/82590970-file_1491698178159_13ce.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-9/64724203-file_1491698502237_17e1a.png">
<meta property="og:image" content="http://on792ofrp.bkt.clouddn.com/17-4-9/48412357-file_1491704503266_13654.png">
<meta property="og:updated_time" content="2017-04-09T02:25:07.721Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Python3.7 爬虫（一）使用 Urllib2 与正则表达式抓取">
<meta name="twitter:description" content="版权声明：本文为 wintersmilesb101 -（个人独立博客– http://wintersmilesb101.online 欢迎访问）博主原创文章，未经博主允许不得转载。

我们今天就一起来通过 Python3 自带库 Urllib 与正则表达式来抓取糗事百科。废话不多说，下面正题：分析网址通过浏览器进入糗事百科首页，http://www.qiushibaike.com/你会看到如下界">
<meta name="twitter:image" content="http://on792ofrp.bkt.clouddn.com/17-4-8/34449674-file_1491644670774_410a.png">



<script type="text/javascript" id="hexo.configurations">
  // Reuse an already-defined NexT namespace when present; otherwise start with an empty object.
  var NexT = window.NexT || {};

  // Page-level settings object; keys and values are unchanged, only the layout is expanded.
  var CONFIG = {
    root: '/',
    scheme: 'Mist',
    sidebar: {
      position: 'right',
      display: 'post',
      offset: 12,
      offset_float: 0,
      b2t: true,
      scrollpercent: true
    },
    fancybox: true,
    motion: true,
    duoshuo: {
      userId: '6376853978663093000',
      author: 'WinterSmileSB101'
    },
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: { per_page: 10 },
      labels: {
        input_placeholder: 'Search for Posts',
        hits_empty: "We didn't find any results for the search: ${query}",
        hits_stats: '${hits} results found in ${time} ms'
      }
    }
  };
</script>



  <link rel="canonical" href="http://WinterSmileSB101.online/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/"/>





  <title> Python3.7 爬虫（一）使用 Urllib2 与正则表达式抓取 | WinterSmileSB101 的个人房间 </title>
</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-Hans">

  














  
  
    
  

  <div class="container sidebar-position-right page-post-detail ">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/"  class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">WinterSmileSB101 的个人房间</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
      
        <h1 class="site-subtitle" itemprop="description">胆小认生，不易相处</h1>
      
  </div>

  <div class="site-nav-toggle">
    <!-- Icon-only control: explicit type plus an accessible name, since the three bars carry no text. -->
    <button type="button" aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>

<nav class="site-nav">
  

  
    <ul id="menu" class="menu">
      <!-- Primary site navigation. Every entry has a visible text label, so the icons are decorative and hidden from assistive tech. -->
      <li class="menu-item menu-item-home">
        <a href="/" rel="section">
          <i class="menu-item-icon fa fa-fw fa-home" aria-hidden="true"></i> <br />
          首页
        </a>
      </li>
      <li class="menu-item menu-item-categories">
        <a href="/categories" rel="section">
          <i class="menu-item-icon fa fa-fw fa-th" aria-hidden="true"></i> <br />
          分类
        </a>
      </li>
      <li class="menu-item menu-item-about">
        <a href="/about" rel="section">
          <i class="menu-item-icon fa fa-fw fa-user" aria-hidden="true"></i> <br />
          关于
        </a>
      </li>
      <li class="menu-item menu-item-archives">
        <a href="/archives" rel="section">
          <i class="menu-item-icon fa fa-fw fa-archive" aria-hidden="true"></i> <br />
          归档
        </a>
      </li>
      <li class="menu-item menu-item-tags">
        <a href="/tags" rel="section">
          <i class="menu-item-icon fa fa-fw fa-tags" aria-hidden="true"></i> <br />
          标签
        </a>
      </li>
      <li class="menu-item menu-item-commonweal">
        <a href="/404.html" rel="section">
          <i class="menu-item-icon fa fa-fw fa-heartbeat" aria-hidden="true"></i> <br />
          公益404
        </a>
      </li>
      <li class="menu-item menu-item-search">
        <!-- NOTE(review): this entry has no destination URL; presumably a script elsewhere binds .popup-trigger. A button element would be the semantic choice once the theme CSS/JS is confirmed to support it. -->
        <a href="javascript:;" class="popup-trigger">
          <i class="menu-item-icon fa fa-search fa-fw" aria-hidden="true"></i> <br />
          搜索
        </a>
      </li>
    </ul>
  

  
    <div class="site-search">
      <!-- Local search popup: header (icon, close control, text field) followed by the results container. -->
      <div class="popup search-popup local-search-popup">
        <div class="local-search-header clearfix">
          <span class="search-icon">
            <i class="fa fa-search" aria-hidden="true"></i>
          </span>
          <!-- NOTE(review): the close control is a plain span; presumably a click handler is bound elsewhere. It is not keyboard-reachable as written. -->
          <span class="popup-btn-close">
            <i class="fa fa-times-circle" aria-hidden="true"></i>
          </span>
          <div class="local-search-input-wrapper">
            <!-- A placeholder is not a label, so the field also carries an accessible name. -->
            <input autocapitalize="off" autocomplete="off" autocorrect="off"
                   placeholder="搜索..." spellcheck="false"
                   type="text" id="local-search-input"
                   aria-label="搜索">
          </div>
        </div>
        <div id="local-search-result"></div>
      </div>
    </div>
  
</nav>



 </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal " itemscope itemtype="http://schema.org/Article">
    <link itemprop="mainEntityOfPage" href="http://WinterSmileSB101.online/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="WinterSmileSB101">
      <meta itemprop="description" content="">
      <meta itemprop="image" content="http://on792ofrp.bkt.clouddn.com/17-3-22/29073846-file_1490159480452_d2de.jpg">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="WinterSmileSB101 的个人房间">
    </span>

    
      <header class="post-header">

        
        
          <h2 class="post-title" itemprop="name headline">
            
            
              
                Python3.7 爬虫（一）使用 Urllib2 与正则表达式抓取
              
            
          </h2>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              
              <time title="创建于" itemprop="dateCreated datePublished" datetime="2017-04-08T16:55:47+08:00">
                2017-04-08
              </time>
            

            
              <span class="post-meta-divider">|</span>
            

            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-check-o"></i>
              </span>
              
                <span class="post-meta-item-text">更新于</span>
              
              <time title="更新于" itemprop="dateModified" datetime="2017-04-09T10:25:07+08:00">
                2017-04-09
              </time>
            
          </span>

          
            <span class="post-category" >
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/爬虫/" itemprop="url" rel="index">
                    <span itemprop="name">爬虫</span>
                  </a>
                </span>

                
                
                  ， 
                
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/categories/爬虫/Python-爬虫/" itemprop="url" rel="index">
                    <span itemprop="name">Python 爬虫</span>
                  </a>
                </span>

                
                
              
            </span>
          

          
            
              <span class="post-comments-count">
                <span class="post-meta-divider">|</span>
                <span class="post-meta-item-icon">
                  <i class="fa fa-comment-o"></i>
                </span>
                <a href="/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/#comments" itemprop="discussionUrl">
                  <span class="post-comments-count ds-thread-count" data-thread-key="2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/" itemprop="commentCount"></span>
                </a>
              </span>
            
          

          
          
             <span id="/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/" class="leancloud_visitors" data-flag-title="Python3.7 爬虫（一）使用 Urllib2 与正则表达式抓取">
               <span class="post-meta-divider">|</span>
               <span class="post-meta-item-icon">
                 <i class="fa fa-eye"></i>
               </span>
               
                 <span class="post-meta-item-text">阅读次数 </span>
               
                 <span class="leancloud-visitors-count"></span>
             </span>
          

          

          

          

        </div>
      </header>
    


    <div class="post-body" itemprop="articleBody">

      
      

      
        <blockquote>
<p>版权声明：本文为 wintersmilesb101 -（个人独立博客– <a href="http://wintersmilesb101.online">http://wintersmilesb101.online</a> 欢迎访问）博主原创文章，未经博主允许不得转载。</p>
</blockquote>
<h2 id="我们今天就一起来通过-Python3-自带库-Urllib-与正则表达式来抓取糗事百科。废话不多说，下面正题："><a href="#我们今天就一起来通过-Python3-自带库-Urllib-与正则表达式来抓取糗事百科。废话不多说，下面正题：" class="headerlink" title="我们今天就一起来通过 Python3 自带库 Urllib 与正则表达式来抓取糗事百科。废话不多说，下面正题："></a>我们今天就一起来通过 Python3 自带库 Urllib 与正则表达式来抓取糗事百科。废话不多说，下面正题：</h2><h3 id="分析网址"><a href="#分析网址" class="headerlink" title="分析网址"></a>分析网址</h3><p>通过浏览器进入糗事百科首页，<a href="http://www.qiushibaike.com/" target="_blank" rel="external">http://www.qiushibaike.com/</a><br>你会看到如下界面：<br><img src="http://on792ofrp.bkt.clouddn.com/17-4-8/34449674-file_1491644670774_410a.png" alt="image"><br>然后按下F12，进入开发者模式，感觉貌似没什么特殊的嘛。<br>那么直接通过 <a href="http://www.qiushibaike.com/" target="_blank" rel="external">http://www.qiushibaike.com/</a> 网址进行爬取。</p>
<h3 id="准备工作"><a href="#准备工作" class="headerlink" title="准备工作"></a>准备工作</h3><p>既然是第一篇，那么必不可少的就是环境的搭建以及编辑器的选取。</p>
<ul>
<li>这里环境的搭建我就不多说了 廖雪峰<a href="http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000" target="_blank" rel="external">廖老师的教程</a>中说的很清楚,Python一点都不了解的童鞋可以先看看这个学习一下，我也整理了 PDF 以及 EPUB 版本，观看或者下载，<a href="http://www.kancloud.cn/smilesb101/python3_x" target="_blank" rel="external">地址</a></li>
<li>至于编辑器，我这里推荐 vscode，好看开源插件多,这里再介绍开发 python 时候的辅助插件，<a href="http://www.cnblogs.com/bloglkl/archive/2016/08/23/5797805.html" target="_blank" rel="external">地址</a></li>
</ul>
<h3 id="新建项目文件夹"><a href="#新建项目文件夹" class="headerlink" title="新建项目文件夹"></a>新建项目文件夹</h3><p>任意找一个位置，只要你自己觉得舒服的地方新建一个项目文件夹，比如我的项目位置是：</p>
<figure class="highlight cmd"><table><tr><td class="gutter"><pre><div class="line">1</div></pre></td><td class="code"><pre><div class="line"><span class="function">E:\<span class="title">adt</span>-<span class="title">bundle</span>-<span class="title">windows</span>-<span class="title">x86_64</span>-20131030\<span class="title">python</span>\3.<span class="title">x</span>\<span class="title">projects</span>\<span class="title">demo</span></span></div></pre></td></tr></table></figure>
<h3 id="新建文件"><a href="#新建文件" class="headerlink" title="新建文件"></a>新建文件</h3><p>我这里是使用的 vscode 选中文件夹右键通过vscode打开，而后在软件中的文件夹上右键新建文件，输入文件名，然后输入如下语句：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div><div class="line">5</div><div class="line">6</div><div class="line">7</div><div class="line">8</div></pre></td><td class="code"><pre><div class="line"># -*- coding: utf-8 -*-</div><div class="line">import urllib.request</div><div class="line">import urllib</div><div class="line"></div><div class="line">url = &quot;http://www.qiushibaike.com&quot;</div><div class="line">response = urllib.request.urlopen(url)</div><div class="line">content = response.read().decode(&apos;utf-8&apos;)</div><div class="line">print(content)</div></pre></td></tr></table></figure>
<p>运行，出师不利啊，看看提示，说是没有响应。<br><img src="http://on792ofrp.bkt.clouddn.com/17-4-8/53875935-file_1491655849141_11631.png" alt="image"></p>
<h4 id="访问网页无响应"><a href="#访问网页无响应" class="headerlink" title="访问网页无响应"></a>访问网页无响应</h4><p>其实就是网站的 UA 防护，一般的网站都要检查是否是浏览器在进行访问，所以我们这里的方式就是设置请求头，最简单的设置一个浏览器类型好了。<br>为上述代码添加如下代码,并且修改打开url 为 req：<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div></pre></td><td class="code"><pre><div class="line">user_agent = <span class="string">'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'</span></div><div class="line">req = urllib.request.Request(url, headers=&#123;</div><div class="line">    <span class="string">'User-Agent'</span>: <span class="string">'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'</span></div><div class="line">&#125;)</div></pre></td></tr></table></figure></p>
<h3 id="再次运行-这里网页太长就不贴完了。"><a href="#再次运行-这里网页太长就不贴完了。" class="headerlink" title="再次运行,这里网页太长就不贴完了。"></a>再次运行,这里网页太长就不贴完了。</h3><p><img src="http://on792ofrp.bkt.clouddn.com/17-4-8/85847293-file_1491656241405_70a9.png" alt="image"></p>
<h3 id="提取自己想要的信息"><a href="#提取自己想要的信息" class="headerlink" title="提取自己想要的信息"></a>提取自己想要的信息</h3><p>前面我们已经拿到了网页 html 想做什么都随便我们了，这里我们要使用这些 html 来获得我们想要的信息。比如这里我们要获取每条段子的文字或者图片链接<br>那么再次回到浏览器，按下 F12，点击 Elements 面板，效果如下：<br><img src="http://on792ofrp.bkt.clouddn.com/17-4-8/50805057-file_1491657254172_8d4f.png" alt="image"></p>
<p>从上图可以发现，我们需要的信息在右边，1 位置是发布者，发布者头像等，2 位置就是我们所需要的文字内容了，3 位置就是我们的图片了，没有图片的话是没有 3 位置的 div 的。</p>
<h3 id="设计正则表达式"><a href="#设计正则表达式" class="headerlink" title="设计正则表达式"></a>设计正则表达式</h3><p>这里给出一个学习 Python3 re 模块正则表达式，对正则表达式不了解的可以看看，想要深入了解请自行百度正则表达式，<a href="http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html" target="_blank" rel="external">地址</a></p>
<p>我们要匹配的内容如下：<br><figure class="highlight html"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div><div class="line">5</div><div class="line">6</div><div class="line">7</div><div class="line">8</div><div class="line">9</div><div class="line">10</div><div class="line">11</div></pre></td><td class="code"><pre><div class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"/article/118848511"</span> <span class="attr">target</span>=<span class="string">"_blank"</span> <span class="attr">class</span>=<span class="string">"contentHerf"</span>&gt;</span></div><div class="line"><span class="tag">&lt;<span class="name">div</span> <span class="attr">class</span>=<span class="string">"content"</span>&gt;</span></div><div class="line"><span class="tag">&lt;<span class="name">span</span>&gt;</span>今天去相亲了！去之前，媒人拿出了张照片 说女方喜欢拍古装艺术照 我一看照片 虽然只露出眼睛和眉毛，但从我阅人无数的经验来看，此女子也应该还不错！可回来之后，我生气的质问媒人 完全跟照片不一样啊 媒人说.....你是不是把照片拿反了.....<span class="tag">&lt;/<span class="name">span</span>&gt;</span></div><div class="line"><span class="tag">&lt;/<span class="name">div</span>&gt;</span></div><div class="line"><span class="tag">&lt;/<span class="name">a</span>&gt;</span></div><div class="line"></div><div class="line"><span class="tag">&lt;<span class="name">div</span> <span class="attr">class</span>=<span class="string">"thumb"</span>&gt;</span></div><div class="line"><span class="tag">&lt;<span class="name">a</span> <span class="attr">href</span>=<span class="string">"/article/118848511"</span> <span class="attr">target</span>=<span class="string">"_blank"</span>&gt;</span></div><div class="line"><span class="tag">&lt;<span class="name">img</span> <span class="attr">src</span>=<span class="string">"http://pic.qiushibaike.com/system/pictures/11884/118848511/medium/app118848511.jpg"</span> <span class="attr">alt</span>=<span 
class="string">"完全跟照片不一样啊"</span>&gt;</span></div><div class="line"><span class="tag">&lt;/<span class="name">a</span>&gt;</span></div><div class="line"><span class="tag">&lt;/<span class="name">div</span>&gt;</span></div></pre></td></tr></table></figure></p>
<p>先来匹配文字部分吧，<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div></pre></td><td class="code"><pre><div class="line"><span class="string">'&lt;div.*?class="content"&gt;\n*?&lt;span.*?&lt;/span&gt;\n*?&lt;/div&gt;'</span></div></pre></td></tr></table></figure></p>
<p>使用 re 模块进行正则匹配，添加 <code>import re</code> 引用，并且追加下面代码到文件末尾</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div></pre></td><td class="code"><pre><div class="line">pattern = re.compile(&apos;&lt;div.*?class=&quot;content&quot;&gt;\n*?&lt;span.*?&lt;/span&gt;\n*?&lt;/div&gt;&apos;)</div><div class="line">items = re.findall(pattern, content)</div><div class="line">for item in items:</div><div class="line">    print(item)</div></pre></td></tr></table></figure>
<p>运行：可以看到我们已经取出需要的标签了</p>
<p><img src="http://on792ofrp.bkt.clouddn.com/17-4-8/42316034-file_1491659592251_5971.png" alt="image"></p>
<p>不过可以看到，这里我们获取到了只是标签的位置，这里我们需要的是内容，所以需要去掉我们不需要的部分，通过字符串的 replace(old,new) 可以实现这一点，修改代码如下：<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div><div class="line">5</div></pre></td><td class="code"><pre><div class="line">b = <span class="string">'&lt;div class="content"&gt;&lt;span&gt;'</span></div><div class="line"><span class="keyword">for</span> item <span class="keyword">in</span> items:</div><div class="line">    temp = item.replace(<span class="string">'\n'</span>, <span class="string">''</span>)</div><div class="line">    temp = temp.replace(<span class="string">'&lt;/span&gt;&lt;/div&gt;'</span>, <span class="string">''</span>)</div><div class="line">    print(temp.replace(b, <span class="string">''</span>)+<span class="string">'\n'</span>)</div></pre></td></tr></table></figure></p>
<p>再次运行：</p>
<p><img src="http://on792ofrp.bkt.clouddn.com/17-4-8/84188870-file_1491662005565_12201.png" alt="image"></p>
<p>可以看到已经是纯文本了。</p>
<h3 id="获取图片段子"><a href="#获取图片段子" class="headerlink" title="获取图片段子"></a>获取图片段子</h3><p>编写正则表达式</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div></pre></td><td class="code"><pre><div class="line"><span class="string">'&lt;a.*?&gt;\n&lt;img.*?&gt;'</span></div></pre></td></tr></table></figure>
<p>写入代码：<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div></pre></td><td class="code"><pre><div class="line"></div><div class="line"></div></pre></td></tr></table></figure></p>
<p>运行：</p>
<p><img src="http://on792ofrp.bkt.clouddn.com/17-4-8/14164501-file_1491662403193_3fa9.png" alt="image"></p>
<p>和上面一样，获取到了标签，但是我们需要准确的数据，这里我们再使用一次正则表达式匹配网址：<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div></pre></td><td class="code"><pre><div class="line"><span class="string">'http:.*[JPEG|jpg]'</span></div></pre></td></tr></table></figure></p>
<p>修改代码如下：<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div></pre></td><td class="code"><pre><div class="line"><span class="keyword">for</span> item <span class="keyword">in</span> items:</div><div class="line">    resp = re.compile(<span class="string">'http:.*[JPEG|jpg]'</span>)</div><div class="line">    res = resp.findall(item)</div><div class="line">    print(res[<span class="number">0</span>])</div></pre></td></tr></table></figure></p>
<p>运行：</p>
<p><img src="http://on792ofrp.bkt.clouddn.com/17-4-8/99243932-file_1491663434712_16972.png" alt="image"></p>
<p>图片地址取出来了</p>
<h3 id="一次完成工作"><a href="#一次完成工作" class="headerlink" title="一次完成工作"></a>一次完成工作</h3><p>下面我们来把上面的工作合到一起完成,这里就要提到一个概念 分组，<a href="http://blog.csdn.net/seetheworld518/article/details/49302829" target="_blank" rel="external">原文</a><br>概念解释如下：</p>
<h4 id="正则表达式分组"><a href="#正则表达式分组" class="headerlink" title="正则表达式分组"></a>正则表达式分组</h4><p>分组就是用一对圆括号“()”括起来的正则表达式，匹配出的内容就表示一个分组。从正则表达式的左边开始看，看到的第一个左括号“(”表示第一个分组，第二个表示第二个分组，依次类推，需要注意的是，有一个隐含的全局分组（就是0），就是整个正则表达式。<br>分完组以后，要想获得某个分组的内容，直接使用group(num)和groups()函数去直接提取就行。</p>
<p>例如：提取代码中的超链接中的文本<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div><div class="line">5</div><div class="line">6</div></pre></td><td class="code"><pre><div class="line"><span class="meta">&gt;&gt;&gt; </span>s=<span class="string">'&lt;div&gt;&lt;a href="https://support.google.com/chrome/?p=ui_hotword_search" target="_blank"&gt;更多&lt;/a&gt;&lt;p&gt;dfsl&lt;/p&gt;&lt;/div&gt;'</span></div><div class="line"><span class="meta">&gt;&gt;&gt; </span><span class="keyword">print</span> re.search(<span class="string">r'&lt;a.*&gt;(.*)&lt;/a&gt;'</span>,s).group(<span class="number">1</span>)</div><div class="line">更多</div><div class="line">或者</div><div class="line"><span class="meta">&gt;&gt;&gt; </span><span class="keyword">print</span> re.match(<span class="string">r'.*&lt;a.*&gt;(.*)&lt;/a&gt;'</span>,s).group(<span class="number">1</span>)</div><div class="line">更多</div></pre></td></tr></table></figure></p>
<p>按照上面的分组匹配以后，我们就可以拿到我们想拿到的字串，但是如果我们正则表达式中括号比较多，那我们在拿我们想要的字串时，要去挨个数我们想要的字串时第几个括号，这样会很麻烦，这个时候Python又引入了另一种分组，那就是命名分组，上面的叫无名分组。</p>
<h4 id="命名分组"><a href="#命名分组" class="headerlink" title="命名分组"></a>命名分组</h4><p>命名分组就是给具有默认分组编号的组另外再给一个别名。命名分组的语法格式如下：</p>
<p>(?P&lt;name&gt;正则表达式)#name是一个合法的标识符<br>如：提取字符串中的ip地址<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div></pre></td><td class="code"><pre><div class="line"><span class="meta">&gt;&gt;&gt; </span>s = <span class="string">"ip='230.192.168.78',version='1.0.0'"</span></div><div class="line"><span class="meta">&gt;&gt;&gt; </span>res = re.search(<span class="string">r"ip='(?P&lt;ip&gt;\d+\.\d+\.\d+\.\d+).*"</span>, s)</div><div class="line"><span class="meta">&gt;&gt;&gt; </span>res.group(<span class="string">'ip'</span>)<span class="comment">#通过命名分组引用分组</span></div><div class="line"><span class="string">'230.192.168.78'</span></div></pre></td></tr></table></figure></p>
<h4 id="存在的一个坑"><a href="#存在的一个坑" class="headerlink" title="存在的一个坑"></a>存在的一个坑</h4><p>这里有一个匹配模式的坑，关于匹配模式：</p>
<h5 id="编译标志（匹配模式）"><a href="#编译标志（匹配模式）" class="headerlink" title="编译标志（匹配模式）"></a>编译标志（匹配模式）</h5><p>re.compile() 函数还接受可选的第二个参数，用以设置匹配模式。可选的匹配模式有：</p>
<ul>
<li>re.IGNORECASE：忽略大小写，同 re.I。</li>
<li>re.MULTILINE：多行模式，改变^和$的行为，同 re.M。</li>
<li>re.DOTALL：点任意匹配模式，让’.’可以匹配包括’\n’在内的任意字符，同 re.S。</li>
<li>re.LOCALE：使预定字符类 \w \W \b \B \s \S 取决于当前区域设定， 同 re.L。</li>
<li>re.ASCII：使 \w \W \b \B \s \S 只匹配 ASCII 字符，而不是 Unicode 字符，同 re.A。</li>
<li>re.VERBOSE：详细模式。这个模式下正则表达式可以是多行，忽略空白字符，并可以加入注释。主要是为了让正则表达式更易读，同re.X。例如，以下两个正则表达式是等价的：<figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div></pre></td><td class="code"><pre><div class="line">a = re.compile(<span class="string">r"""\d +  # the integral part </span></div><div class="line">                   \.    # the decimal point </div><div class="line">                   \d *  # some fractional digits""", re.X)  </div><div class="line">b = re.compile(<span class="string">r"\d+\.\d*"</span>)</div></pre></td></tr></table></figure>
</li>
</ul>
<p>看到这里是不是很兴奋啊？这样可以简化很多操作，那么来试试：<br><figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div></pre></td><td class="code"><pre><div class="line">pattern = re.compile(<span class="string">'&lt;div.*?class="content"&gt;.*?&lt;span&gt;(.*?)&lt;/span&gt;.*?&lt;/div&gt;'</span>, re.S)</div><div class="line">items = re.findall(pattern, content)</div></pre></td></tr></table></figure></p>
<p>还没等运行呢，就报错了。。</p>
<p><img src="http://on792ofrp.bkt.clouddn.com/17-4-9/82590970-file_1491698178159_13ce.png" alt="image"></p>
<p>神马，说不存在？我去，什么意思，<a href="https://docs.python.org/3.7/library/re.html?highlight=re#module-re" target="_blank" rel="external">官方文档</a>莫非是在逗我吗？<br>仔细研究，搞了半天，扫描了 3 遍官方文档之后，终于发现了，你丫的藏的这么深！</p>
<p><img src="http://on792ofrp.bkt.clouddn.com/17-4-9/64724203-file_1491698502237_17e1a.png" alt="image"></p>
<p>原来在 Python 3.6 之后，把以前的 re.S 等等的 flag 全部转移到 RegexFlag 中了，所以我们要改变用法 把 <code>re.S</code> 改写成 <code>re.RegexFlag.S</code>, 这次没有报错误了，运行(perfect nice！)：</p>
<p><img src="http://on792ofrp.bkt.clouddn.com/17-4-9/23522085-file_1491698778621_16868.png" alt="image"></p>
<h3 id="我们的代码修改"><a href="#我们的代码修改" class="headerlink" title="我们的代码修改"></a>我们的代码修改</h3><p>于是我们根据分组以及模式匹配就可以非常简单的取到对应位置的正则表达式的值，修改正则表达式如下，现在我们一口气获取到内容与图片,最终代码：</p>
<figure class="highlight python"><table><tr><td class="gutter"><pre><div class="line">1</div><div class="line">2</div><div class="line">3</div><div class="line">4</div><div class="line">5</div><div class="line">6</div><div class="line">7</div><div class="line">8</div><div class="line">9</div><div class="line">10</div><div class="line">11</div><div class="line">12</div><div class="line">13</div><div class="line">14</div><div class="line">15</div><div class="line">16</div><div class="line">17</div><div class="line">18</div><div class="line">19</div><div class="line">20</div><div class="line">21</div><div class="line">22</div><div class="line">23</div><div class="line">24</div><div class="line">25</div><div class="line">26</div></pre></td><td class="code"><pre><div class="line"># -*- coding: utf-8 -*-</div><div class="line">import urllib.request</div><div class="line">import urllib</div><div class="line">import re</div><div class="line"></div><div class="line">url = &quot;http://www.qiushibaike.com/imgrank/&quot;</div><div class="line">user_agent = &apos;Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)&apos;</div><div class="line">req = urllib.request.Request(url, headers=&#123;</div><div class="line">    &apos;User-Agent&apos;: &apos;Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)&apos;</div><div class="line">&#125;)</div><div class="line">response = urllib.request.urlopen(req)</div><div class="line">content = response.read().decode(&apos;utf-8&apos;)</div><div class="line">print(content)</div><div class="line">patternPic = re.compile(&apos;&lt;a.*?&gt;\n&lt;img src=&quot;(.*?)&quot;.*?&gt;&apos;)</div><div class="line">pattern = re.compile(&apos;&lt;div.*?class=&quot;content&quot;&gt;.*?&lt;span&gt;(.*?)&lt;/span&gt;.*?&lt;/a&gt;&apos;+&apos;(.*?&lt;div.*?&quot;stats&quot;.*?&lt;/div&gt;)&apos;, re.RegexFlag.S)</div><div class="line">items = re.findall(pattern, content)</div><div class="line">for item in items:</div><div class="line">    print(isinstance(item, str))</div><div class="line">    print()</div><div class="line">    if re.search(&apos;img&apos;, item[1]):</div><div class="line">        #再次匹配</div><div class="line">        patternA = re.compile(&apos;&lt;a.*?&gt;.*?&lt;img src=&quot;(.*?)&quot;.*?&gt;&apos;, re.RegexFlag.S)</div><div class="line">        img = patternA.findall(item[1])</div><div class="line">        print(&apos;段子：==&gt; &apos;+item[0], &apos;\n\n&apos;, &apos;段子图片：==&gt; &apos;+img[0]+&apos;\n\n\n&apos;)</div><div class="line">    else:</div><div class="line">        print(&apos;段子：==&gt; &apos;+item[0], &apos;\n\n\n&apos;)</div></pre></td></tr></table></figure>
<p>效果如下：</p>
<p><img src="http://on792ofrp.bkt.clouddn.com/17-4-9/48412357-file_1491704503266_13654.png" alt="image"></p>
<p>那么我们的爬虫就算是完成了，你也可以完善一下，比如说爬取用户的信息以及评论等<br>最后欢迎交流学习。</p>

      
    </div>

    <div>
      
        
<div id="wechat_subscriber" style="display: block; padding: 10px 0; margin: 20px auto; width: 100%; text-align: center">
    <img id="wechat_subscriber_qcode" src="http://on792ofrp.bkt.clouddn.com/17-3-22/89969433-file_1490159480281_107f8.jpg" alt="WinterSmileSB101 wechat" style="width: 200px; max-width: 100%;"/>
    <div>欢迎关注我的微信公众帐号，不定期推送各种技术文章</div>
</div>


      
    </div>

    <div>
      
        

      
    </div>

    <div>
      
        
  <ul class="post-copyright">
    <li class="post-copyright-author">
      <strong>本文作者：</strong>
      WinterSmileSB101
    </li>
    <li class="post-copyright-link">
      <strong>本文链接：</strong>
      <a href="http://WinterSmileSB101.online/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/" title="Python3.7 爬虫（一）使用 Urllib2 与正则表达式抓取">http://WinterSmileSB101.online/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/</a>
    </li>
    <li class="post-copyright-license">
      <strong>版权声明： </strong>
      本博客所有文章除特别声明外，均采用 <a href="https://creativecommons.org/licenses/by-nc-sa/3.0/" rel="external nofollow noopener noreferrer" target="_blank">CC BY-NC-SA 3.0</a> 许可协议。转载请注明出处！
    </li>
  </ul>


      
    </div>

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/tags/Python3/" rel="tag"># Python3</a>
          
            <a href="/tags/爬虫/" rel="tag"># 爬虫</a>
          
            <a href="/tags/Urllib2/" rel="tag"># Urllib2</a>
          
            <a href="/tags/正则表达式/" rel="tag"># 正则表达式</a>
          
        </div>
      

      
        
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/2017/04/08/Python3.7 爬虫（三）使用 Urllib2 与 BeautifulSoup 爬取网易云音乐歌单/" rel="next" title="Python3.7 爬虫（三）使用 Urllib2 与 BeautifulSoup4 爬取网易云音乐歌单">
                <i class="fa fa-chevron-left"></i> Python3.7 爬虫（三）使用 Urllib2 与 BeautifulSoup4 爬取网易云音乐歌单
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
              <a href="/2017/04/08/Python3.7 爬虫（二）使用 Urllib2 与 BeautifulSoup 抓取解析网页/" rel="prev" title="Python3.7 爬虫（二）使用 Urllib2 与 BeautifulSoup4 抓取解析网页">
                Python3.7 爬虫（二）使用 Urllib2 与 BeautifulSoup4 抓取解析网页 <i class="fa fa-chevron-right"></i>
              </a>
            
          </div>
        </div>
      

      
      
    </footer>
  </article>



    <div class="post-spread">
      
        <div class="ds-share flat" data-thread-key="2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/"
     data-title="Python3.7 爬虫（一）使用 Urllib2 与正则表达式抓取"
     data-content=""
     data-url="http://WinterSmileSB101.online/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/">
  <div class="ds-share-inline">
    <ul  class="ds-share-icons-16">

      <li data-toggle="ds-share-icons-more"><a class="ds-more" href="javascript:void(0);">分享到：</a></li>
      <li><a class="ds-weibo" href="javascript:void(0);" data-service="weibo">微博</a></li>
      <li><a class="ds-qzone" href="javascript:void(0);" data-service="qzone">QQ空间</a></li>
      <li><a class="ds-qqt" href="javascript:void(0);" data-service="qqt">腾讯微博</a></li>
      <li><a class="ds-wechat" href="javascript:void(0);" data-service="wechat">微信</a></li>

    </ul>
    <div class="ds-share-icons-more">
    </div>
  </div>
</div>
      
    </div>
  </div>


          </div>
          


          
  <div class="comments" id="comments">
    
      <div class="ds-thread" data-thread-key="2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/"
           data-title="Python3.7 爬虫（一）使用 Urllib2 与正则表达式抓取" data-url="http://WinterSmileSB101.online/2017/04/08/Python3.7 爬虫（一）使用 Urllib 与正则表达式抓取/">
      </div>
    
  </div>


        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap" >
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview sidebar-panel">
        <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
          <img class="site-author-image" itemprop="image"
               src="http://on792ofrp.bkt.clouddn.com/17-3-22/29073846-file_1490159480452_d2de.jpg"
               alt="WinterSmileSB101" />
          <p class="site-author-name" itemprop="name">WinterSmileSB101</p>
           
              <p class="site-description motion-element" itemprop="description"></p>
           
        </div>
        <nav class="site-state motion-element">

          
            <div class="site-state-item site-state-posts">
              <a href="/archives">
                <span class="site-state-item-count">52</span>
                <span class="site-state-item-name">日志</span>
              </a>
            </div>
          

          
            
            
            <div class="site-state-item site-state-categories">
              <a href="/categories/index.html">
                <span class="site-state-item-count">26</span>
                <span class="site-state-item-name">分类</span>
              </a>
            </div>
          

          
            
            
            <div class="site-state-item site-state-tags">
              <a href="/tags/index.html">
                <span class="site-state-item-count">113</span>
                <span class="site-state-item-name">标签</span>
              </a>
            </div>
          

        </nav>

        
          <div class="feed-link motion-element">
            <a href="http://blog.csdn.net/qq_21265915/rss/list" rel="alternate">
              <i class="fa fa-rss"></i>
              RSS
            </a>
          </div>
        

        <!--自己写的社交链接-->
        <div class="links-of-author motion-element">
         <span class="links-of-author-item">
         <a href="https://github.com/WinterSmileSB101" title="Github">
         <i class="fa fa-fw fa-github fa-lg"></i>
         </a>
         </span>
         <span class="links-of-author-item">
                  <a href="http://weibo.com/5602632941/profile?rightmod=1&amp;wvr=6&amp;mod=personinfo&amp;is_all=1" title="微博">
                  <i class="fa fa-fw fa-weibo fa-lg"></i>
                  </a>
                  </span>
         <span class="links-of-author-item">
         <a href="http://www.jianshu.com/users/73344bc7e890/timeline" title="简书">
         <i class="fa fa-fw fa-bookmark fa-lg"></i>
         </a>
         </span>
<br />
        <span class="links-of-author-item">
                 <a href="https://www.douban.com/people/159359470/" title="豆瓣">
                 <i class="fa fa-fw fa-newspaper-o fa-lg"></i>
                 </a>
                 </span>
        <span class="links-of-author-item">
                 <a href="http://blog.csdn.net/qq_21265915" title="CSDN博客">
                 <i class="fa fa-fw fa-bug fa-lg"></i>
                 </a>
                 </span>
        </div>
        <!--自己写的社交链接-->

        
        

        
        

        


      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#我们今天就一起来通过-Python3-自带库-Urllib-与正则表达式来抓取糗事百科。废话不多说，下面正题："><span class="nav-number">1.</span> <span class="nav-text">我们今天就一起来通过 Python3 自带库 Urllib 与正则表达式来抓取糗事百科。废话不多说，下面正题：</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#分析网址"><span class="nav-number">1.1.</span> <span class="nav-text">分析网址</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#准备工作"><span class="nav-number">1.2.</span> <span class="nav-text">准备工作</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#新建项目文件夹"><span class="nav-number">1.3.</span> <span class="nav-text">新建项目文件夹</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#新建文件"><span class="nav-number">1.4.</span> <span class="nav-text">新建文件</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#访问网页无响应"><span class="nav-number">1.4.1.</span> <span class="nav-text">访问网页无响应</span></a></li></ol></li><li class="nav-item nav-level-3"><a class="nav-link" href="#再次运行-这里网页太长就不贴完了。"><span class="nav-number">1.5.</span> <span class="nav-text">再次运行,这里网页太长就不贴完了。</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#提取自己想要的信息"><span class="nav-number">1.6.</span> <span class="nav-text">提取自己想要的信息</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#设计正则表达式"><span class="nav-number">1.7.</span> <span class="nav-text">设计正则表达式</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#获取图片段子"><span class="nav-number">1.8.</span> <span class="nav-text">获取图片段子</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#一次完成工作"><span class="nav-number">1.9.</span> <span class="nav-text">一次完成工作</span></a><ol class="nav-child"><li class="nav-item nav-level-4"><a class="nav-link" href="#正则表达式分组"><span class="nav-number">1.9.1.</span> 
<span class="nav-text">正则表达式分组</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#命名分组"><span class="nav-number">1.9.2.</span> <span class="nav-text">命名分组</span></a></li><li class="nav-item nav-level-4"><a class="nav-link" href="#存在的一个坑"><span class="nav-number">1.9.3.</span> <span class="nav-text">存在的一个坑</span></a><ol class="nav-child"><li class="nav-item nav-level-5"><a class="nav-link" href="#编译标志（匹配模式）"><span class="nav-number">1.9.3.1.</span> <span class="nav-text">编译标志（匹配模式）</span></a></li></ol></li></ol></li></ol></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      
        <div class="back-to-top">
          <i class="fa fa-arrow-up"></i>
          
            <span id="scrollpercent"><span>0</span>%</span>
          
        </div>
      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright" >
  
  &copy;  2017.3.20 - 
  <span itemprop="copyrightYear">2017</span>
  <span class="with-love">
    <i class="fa fa-heart"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">Powered By - WinterSmileSB101</span>
</div>


<div class="powered-by">
    个人专属
</div>

<div class="theme-info">
  博客 -
  WinterSmileSB101
</div>


        

        
      </div>
    </footer>

    

  </div>

  

<script type="text/javascript">
  // If window.Promise is anything other than a real function object,
  // overwrite it with null.
  (function (win) {
    var promiseTag = Object.prototype.toString.call(win.Promise);
    if (promiseTag !== '[object Function]') {
      win.Promise = null;
    }
  })(window);
</script>









  


  



  
  <script type="text/javascript" src="/lib/jquery/index.js?v=2.1.3"></script>

  
  <script type="text/javascript" src="/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>

  
  <script type="text/javascript" src="/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>

  
  <script type="text/javascript" src="/lib/velocity/velocity.min.js?v=1.2.1"></script>

  
  <script type="text/javascript" src="/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>

  
  <script type="text/javascript" src="/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>

  
  <script type="text/javascript" src="/lib/canvas-nest/canvas-nest.min.js"></script>


  


  <script type="text/javascript" src="/js/src/utils.js?v=5.1.0"></script>

  <script type="text/javascript" src="/js/src/motion.js?v=5.1.0"></script>



  
  

  
  <script type="text/javascript" src="/js/src/scrollspy.js?v=5.1.0"></script>
<script type="text/javascript" src="/js/src/post-details.js?v=5.1.0"></script>



  


  <script type="text/javascript" src="/js/src/bootstrap.js?v=5.1.0"></script>



  

  
    
  

  <script type="text/javascript">
    // Global configuration object; presumably read by the Duoshuo embed script
    // loaded below (its consumer is not visible in this file).
    var duoshuoQuery = {short_name:"wintersmilesb101"};
    (function() {
      // Use the page's own scheme when it is https, otherwise plain http.
      var scheme = 'http:';
      if (document.location.protocol == 'https:') {
        scheme = 'https:';
      }

      var embedScript = document.createElement('script');
      embedScript.type = 'text/javascript';
      embedScript.async = true;
      embedScript.id = 'duoshuo-script';
      embedScript.src = scheme + '//static.duoshuo.com/embed.js';
      embedScript.charset = 'UTF-8';

      // Attach to the head element, falling back to body when no head exists.
      var host = document.getElementsByTagName('head')[0];
      if (!host) {
        host = document.getElementsByTagName('body')[0];
      }
      host.appendChild(embedScript);
    })();
  </script>

  
    
      
      <script src="/lib/ua-parser-js/dist/ua-parser.min.js?v=0.7.9"></script>
      <script src="/js/src/hook-duoshuo.js?v=5.1.0"></script>
  
















  

  <script type="text/javascript">
    // Popup window state: set to true once the search index has been fetched.
    var isfetched = false;
    // Search index location; an empty value falls back to "search.xml".
    var search_path = "search.xml";
    search_path = search_path.length == 0 ? "search.xml" : search_path;
    // Site-root-relative URL of the search index.
    var path = "/" + search_path;
    // monitor main search box;

    /**
     * Show or hide the search popup: appends the overlay element to the page
     * body, disables body scrolling, and toggles the popup's visibility.
     */
    function proceedsearch() {
      var $page = $("body");
      $page.append('<div class="search-popup-overlay local-search-pop-overlay"></div>');
      $page.css('overflow', 'hidden');

      var $popup = $('.popup');
      $popup.toggle();
    }
    // search function;
    /**
     * Fetch the local search index and enable live searching in the popup.
     *
     * Downloads the XML index at `path`, reads the title, content and url of
     * every entry, and listens for `input` events on the search box. Each
     * event re-renders the result list into the result container. Once the
     * index has loaded, the global `isfetched` flag is set and the popup is
     * shown via proceedsearch().
     *
     * @param {string} path        URL of the XML search index.
     * @param {string} search_id   id of the search box element whose value is read.
     * @param {string} content_id  id of the element that receives the rendered results.
     */
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // Escape regular-expression metacharacters so a keyword is matched literally.
      function escapeRegExp(text) {
        return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      }

      $.ajax({
        url: path,
        dataType: "xml",
        async: true,
        success: function( xmlResponse ) {
          // get the contents from search data
          isfetched = true;
          $('.popup').detach().appendTo('.header-inner');
          var datas = $( "entry", xmlResponse ).map(function() {
            return {
              title: $( "title", this ).text(),
              content: $("content",this).text(),
              url: $( "url" , this).text()
            };
          }).get();
          var $input = document.getElementById(search_id);
          var $resultContent = document.getElementById(content_id);
          $input.addEventListener('input', function(){
            var matchcounts = 0;
            var str='<ul class=\"search-result-list\">';
            // Split on whitespace/hyphens and drop empty fragments: an empty
            // keyword would match every article and every position in the text.
            var keywords = this.value.trim().toLowerCase().split(/[\s\-]+/).filter(function(keyword) {
              return keyword !== '';
            });
            // One alternation of the escaped keywords: characters such as "+"
            // or "(" typed by the user can no longer produce an invalid
            // pattern, and a later keyword cannot match inside markup that was
            // inserted for an earlier one.
            var highlightPattern = keywords.length > 0
              ? new RegExp(keywords.map(escapeRegExp).join('|'), 'gi')
              : null;
            $resultContent.innerHTML = "";
            if (this.value.trim().length > 1) {
              // perform local searching
              datas.forEach(function(data) {
                var isMatch = false;
                var data_title = data.title.trim().toLowerCase();
                var data_content = data.content.trim().replace(/<[^>]+>/g,"").toLowerCase();
                var data_url = decodeURIComponent(data.url);
                var index_title = -1;
                var index_content = -1;
                var first_occur = -1;
                // only match articles with not empty titles and contents
                if(data_title != '') {
                  keywords.forEach(function(keyword, i) {
                    index_title = data_title.indexOf(keyword);
                    index_content = data_content.indexOf(keyword);
                    if( index_title >= 0 || index_content >= 0 ){
                      isMatch = true;
                      // the excerpt is anchored on the first keyword only
                      if (i == 0) {
                        first_occur = index_content;
                      }
                    }

                  });
                }
                // show search results
                if (isMatch) {
                  matchcounts += 1;
                  str += "<li><a href='"+ data_url +"' class='search-result-title'>"+ data_title +"</a>";
                  var content = data.content.trim().replace(/<[^>]+>/g,"");
                  if (first_occur >= 0) {
                    // cut out 100 characters
                    var start = first_occur - 20;
                    var end = first_occur + 80;
                    if(start < 0){
                      start = 0;
                    }
                    if(start == 0){
                      end = 50;
                    }
                    if(end > content.length){
                      end = content.length;
                    }
                    var match_content = content.substring(start, end);
                    // highlight all keywords, keeping the article's own casing
                    match_content = match_content.replace(highlightPattern, function(hit) {
                      return "<b class=\"search-keyword\">" + hit + "</b>";
                    });

                    str += "<p class=\"search-result\">" + match_content +"...</p>";
                  }
                  str += "</li>";
                }
              });
            }
            str += "</ul>";
            if (matchcounts == 0) { str = '<div id="no-result"><i class="fa fa-frown-o fa-5x" /></div>' }
            if (keywords.length === 0) { str = '<div id="no-result"><i class="fa fa-search fa-5x" /></div>' }
            $resultContent.innerHTML = str;
          });
          proceedsearch();
        }
      });
    };

    // handle and trigger popup window;
    // Opening the popup: the first click loads the search index (which shows
    // the popup when done); later clicks only toggle the popup.
    $('.popup-trigger').on('click', function(evt) {
      evt.stopPropagation();
      if (isfetched == false) {
        searchFunc(path, 'local-search-input', 'local-search-result');
        return;
      }
      proceedsearch();
    });

    // Closing the popup: hide it, drop the overlay and restore body scrolling.
    $('.popup-btn-close').on('click', function(evt) {
      var $popup = $('.popup');
      $popup.hide();

      var $overlay = $(".local-search-pop-overlay");
      $overlay.remove();

      $('body').css('overflow', '');
    });

    // Clicks inside the popup do not propagate to ancestor elements.
    $('.popup').on('click', function(evt) {
      evt.stopPropagation();
    });
  </script>





  

  
  <script src="https://cdn1.lncld.net/static/js/av-core-mini-0.6.1.js"></script>
  <script>AV.initialize("cOFi0858xVYxKW1wnErxqEra-gzGzoHsz", "7LaqqR82XnjzTbkv5eCKw5aW");</script>
  <script>
    /**
     * Fill in the visit count for every ".leancloud_visitors" element on the page.
     *
     * Collects the trimmed id of each such element, queries the Counter class
     * for records whose "url" is one of those ids, and writes each record's
     * "time" value into the matching element's ".leancloud-visitors-count"
     * child. Elements left without any text afterwards are set to 0.
     *
     * @param Counter  the AV object class to query (created by the caller).
     */
    function showTime(Counter) {
      var COUNT_CONTAINER_REF = '.leancloud-visitors-count';
      var query = new AV.Query(Counter);
      var $visitors = $(".leancloud_visitors");

      var entries = $visitors.map(function () {
        return $(this).attr("id").trim();
      }).get();

      // Locate the count holder inside the element whose id equals `pageId`.
      function countHolder(pageId) {
        return $(document.getElementById(pageId)).find(COUNT_CONTAINER_REF);
      }

      function renderCounts(results) {
        // No stored counters at all: every post shows 0.
        if (results.length === 0) {
          $visitors.find(COUNT_CONTAINER_REF).text(0);
          return;
        }

        // First pass: write the stored counts.
        $.each(results, function (idx, record) {
          var pageId = record.get('url');
          var visits = record.get('time');
          countHolder(pageId).text(visits);
        });

        // Second pass: posts that received no count fall back to 0.
        $.each(entries, function (idx, pageId) {
          var $holder = countHolder(pageId);
          if ($holder.text() == '') {
            $holder.text(0);
          }
        });
      }

      function reportFailure(object, error) {
        console.log("Error: " + error.code + " " + error.message);
      }

      query.containedIn('url', entries);
      query.find()
        .done(renderCounts)
        .fail(reportFailure);
    }

    /**
     * Record one visit for the current post and display the resulting count.
     *
     * Reads the post key (the id attribute) and title (data-flag-title) from
     * the ".leancloud_visitors" element, then looks up the Counter record whose
     * "url" equals that key. An existing record has its "time" field
     * incremented; otherwise a new record is created with "time" set to 1.
     * After a successful save the value is written into the element's
     * ".leancloud-visitors-count" child.
     *
     * @param Counter  the AV object class to query and instantiate.
     */
    function addCount(Counter) {
      var $post = $(".leancloud_visitors");
      var pageUrl = $post.attr('id').trim();
      var pageTitle = $post.attr('data-flag-title').trim();
      var lookup = new AV.Query(Counter);

      // Write the record's "time" value into this post's count holder.
      function renderCount(record) {
        var $holder = $(document.getElementById(pageUrl));
        $holder.find('.leancloud-visitors-count').text(record.get('time'));
      }

      // Increment and save an already stored counter.
      function bumpExisting(record) {
        record.fetchWhenSave(true);
        record.increment("time");
        record.save(null, {
          success: renderCount,
          error: function(failed, error) {
            console.log('Failed to save Visitor num, with error message: ' + error.message);
          }
        });
      }

      // Create the counter for a post that has none yet.
      function createFirst() {
        var fresh = new Counter();

        // The new record is made publicly readable and publicly writable.
        var access = new AV.ACL();
        access.setPublicReadAccess(true);
        access.setPublicWriteAccess(true);
        fresh.setACL(access);

        fresh.set("title", pageTitle);
        fresh.set("url", pageUrl);
        fresh.set("time", 1);
        fresh.save(null, {
          success: renderCount,
          error: function(failed, error) {
            console.log('Failed to create');
          }
        });
      }

      lookup.equalTo("url", pageUrl);
      lookup.find({
        success: function(matches) {
          if (matches.length > 0) {
            bumpExisting(matches[0]);
          } else {
            createFirst();
          }
        },
        error: function(error) {
          console.log('Error:' + error.code + " " + error.message);
        }
      });
    }

    // Once the DOM is ready, pick the counter mode: exactly one
    // ".leancloud_visitors" element means a visit is recorded via addCount;
    // otherwise, when more than one ".post-title-link" exists, the stored
    // counts are only displayed via showTime.
    $(document).ready(function() {
      var Counter = AV.Object.extend("Counter");
      var visitorBlocks = $('.leancloud_visitors').length;

      if (visitorBlocks == 1) {
        addCount(Counter);
        return;
      }
      if ($('.post-title-link').length > 1) {
        showTime(Counter);
      }
    });
  </script>



  

  

  

</body>
</html>
